# Point the working directory at the parent of the folder that holds the
# active document, so the relative "Data/..." and "Confidential/..." paths
# used further down resolve.
# NOTE(review): getActiveDocumentContext() is an rstudioapi call, so this
# presumably only works when the script is run inside RStudio -- confirm.
library("rstudioapi")     
setwd(dirname(getActiveDocumentContext()$path))
setwd("../")
# Show the resulting working directory.
getwd()
library("readxl") 
require(miceadds)
library(data.table)
library(tidyverse)
library(DescTools)
library(htmlTable)
library(stargazer)
library(xlsx)
library(estimatr)

# Read the full survey data set.
new_data <- read.csv(file = "Data/full_data.csv")

### Data modification section:
# All derived variables are added to a working copy of the raw data.
final_finalV2 <- new_data

## New variables created:
# Attended: 1 when Q2.5 is "Never attended", 0 for any other answer, NA when
# Q2.5 is missing.
# NOTE(review): the name suggests the opposite coding -- confirm that 1 is
# meant to flag respondents who never attended school.
final_finalV2$Attended <- as.numeric(final_finalV2$Q2.5 == "Never attended")

# Education: collapse the Q2.5 answers into four levels.
Never <- c("Never attended")
Low <- c("Nursery", "Kindergarten", "Primary", "JSS/JHS")
Medium <- c("Middle", "SSS/SHS", "Vocational/Technical/Commercial",
            "Post middle/secondary certificate", "Post-secondary diploma")
High <- c("Bachelor degree", "Post graduate (Cert. Diploma Masters PHD etc)")

# Parallel vectors: each listed answer and the level it belongs to.
education_answers <- c(Never, Low, Medium, High)
education_levels <- rep(
  c("Never attended", "Low", "Medium", "High"),
  times = c(length(Never), length(Low), length(Medium), length(High))
)
# Answers that are not listed above (including NA) become NA.
final_finalV2$Education <-
  education_levels[match(final_finalV2$Q2.5, education_answers)]
# Gender: keep "Female" and "Male"; every other answer (for example
# "Prefer not to say") and missing values become NA.
gender_answer <- as.character(final_finalV2$Q2.3)
final_finalV2$Gender <- ifelse(gender_answer %in% c("Female", "Male"),
                               gender_answer, NA)

# Vaccine status, phase I: the "vaccine" column is renamed.
names(final_finalV2)[names(final_finalV2) == "vaccine"] <- "vaccine_intention"

# Self-reported vaccination, phase II: 1 = yes, 0 = no, NA = anything else.
phase2_answers <- c("Yes, I have received a COVID-19 vaccine",
                    "No, I have not received a COVID-19 vaccine")
final_finalV2$vaccine_reported_ii <-
  c(1, 0)[match(final_finalV2$p_ii_Q8.1, phase2_answers)]

# Self-reported vaccination, phase III (the wording of the answers differs).
phase3_answers <- c(
  "Yes, I have received at least one shot of a COVID-19 vaccine",
  "No, I have not received at least one shot of a COVID-19 vaccine"
)
final_finalV2$vaccine_reported_iii <-
  c(1, 0)[match(final_finalV2$p_iii_Q8.1, phase3_answers)]

# Combined report: the phase II value where available, otherwise phase III.
reported_combo <- final_finalV2$vaccine_reported_ii
phase2_missing <- is.na(reported_combo)
reported_combo[phase2_missing] <-
  final_finalV2$vaccine_reported_iii[phase2_missing]
final_finalV2$vaccine_reported_combo <- reported_combo

# Pooled treatment labels: "Low Cash" and "High Cash" both become "cash".
pooled_labels <- c("placebo", "CDC", "cash", "cash")

# Individual-level arm. Labels that are not listed (including NA) become NA.
individual_arms <- c("Placebo", "CDC Health", "Low Cash", "High Cash")
final_finalV2$cash <-
  pooled_labels[match(final_finalV2[, "individual_treatment"], individual_arms)]

# Village-level arm.
# NOTE(review): the CDC label is spelled "CDC health" here but "CDC Health"
# for the individual treatment -- presumably this mirrors the raw data; verify.
village_arms <- c("Placebo", "CDC health", "Low Cash", "High Cash")
final_finalV2$Vcash <-
  pooled_labels[match(final_finalV2[, "VillageTreatment"], village_arms)]

# Create the Ncash variable based on the Vcash and cash variables:
#   placebo individual in a cash village -> "cash_placebo"
#   placebo individual in a CDC village  -> "CDC_placebo"
#   otherwise the individual arm itself ("placebo", "CDC" or "cash")
#   any other / missing individual arm   -> NA
# Vectorised replacement for the former row-by-row loop; its `control` flag
# was always 0 at the points where it was tested, so it is not needed.
ncash_value <- as.character(final_finalV2$cash)
ncash_value[!(ncash_value %in% c("placebo", "CDC", "cash"))] <- NA

placebo_individual <- ncash_value %in% "placebo"
ncash_value[placebo_individual & final_finalV2$Vcash %in% "cash"] <-
  "cash_placebo"
ncash_value[placebo_individual & final_finalV2$Vcash %in% "CDC"] <-
  "CDC_placebo"

final_finalV2$Ncash <- ncash_value

# Turn the three pooled arm variables into factors and add a "<name>_cat"
# copy whose reference level is "placebo", for the regression models.
for (arm_var in c("cash", "Vcash", "Ncash")) {
  final_finalV2[[arm_var]] <- as.factor(final_finalV2[[arm_var]])
  final_finalV2[[paste0(arm_var, "_cat")]] <-
    relevel(final_finalV2[[arm_var]], ref = "placebo")
}

# The raw individual and village treatment labels become factors with
# "Placebo" as the reference level.
for (treatment_var in c("individual_treatment", "VillageTreatment")) {
  final_finalV2[[treatment_var]] <-
    relevel(as.factor(final_finalV2[[treatment_var]]), ref = "Placebo")
}

# Age in years, coerced to numeric once. The age split below uses this value
# so that "< 45" is a numeric comparison even if Q2.2 was read as text.
age_years <- as.numeric(final_finalV2$Q2.2)

# Additional variable Gender45y related to the BART analysis;
# this variable is used to compare estimated beta coefficients.
# It is NA whenever Gender or age is missing.
final_finalV2$Gender45y <- ifelse(
  final_finalV2$Gender == "Female" & age_years < 45, "Female below 45",
  ifelse(
    final_finalV2$Gender == "Female" & age_years >= 45, "Female above 45",
    ifelse(
      final_finalV2$Gender == "Male" & age_years < 45, "Male below 45",
      ifelse(final_finalV2$Gender == "Male" & age_years >= 45,
             "Male above 45", NA)
    )
  )
)
#table(final_finalV2$Gender45y)

# Age variable
final_finalV2$Age <- age_years

# Full time employment: 1 when Q143 is "Employed (full time)", else 0.
final_finalV2$Employed <- ifelse(final_finalV2$Q143 == "Employed (full time)", 1, 0)

# Income variable / average spending on food (needed for the social network
# metric). Values above 840 are set to NA.
final_finalV2$Income <- as.numeric(final_finalV2$Q144)
final_finalV2$Income[which(final_finalV2$Income > 840)] <- NA
# Return the phase II answer where it is present and the phase III answer
# where phase II is NA.
prefer_phase2 <- function(phase2, phase3) {
  combined <- phase2
  gaps <- is.na(combined)
  if (any(gaps)) {
    combined[gaps] <- phase3[gaps]
  }
  combined
}

# Recode "Yes" / "No" to 1 / 0; anything else (including NA) becomes NA.
yes_no_to_binary <- function(answer) {
  c(1, 0)[match(answer, c("Yes", "No"))]
}

# Villages visited last year:
final_finalV2$VillagesYear <-
  prefer_phase2(final_finalV2$p_ii_Q26, final_finalV2$p_iii_Q26)
# Villages visited last month:
final_finalV2$VillagesMonth <-
  prefer_phase2(final_finalV2$p_ii_Q25, final_finalV2$p_iii_Q25)
# Having family in other villages:
final_finalV2$FamilyVillages <-
  prefer_phase2(final_finalV2$p_ii_Q27, final_finalV2$p_iii_Q27)
final_finalV2$FamilyVillages <- yes_no_to_binary(final_finalV2$FamilyVillages)
# WhatsApp:
final_finalV2$WhatsApp <-
  prefer_phase2(final_finalV2$p_ii_Q2.2, final_finalV2$p_iii_Q2.2)
final_finalV2$WhatsApp <- yes_no_to_binary(final_finalV2$WhatsApp)

# Social network metric (SNMetric): score on the first principal component
# of the four network variables listed below.
# Row identifier used to merge the scores back in.
final_finalV2$ID <- seq_len(nrow(final_finalV2))

network_vars <- c("VillagesMonth", "VillagesYear", "FamilyVillages", "WhatsApp")

# Only rows with all four variables observed enter the PCA.
pca_sample <- na.omit(final_finalV2[, c("ID", network_vars)])

# NOTE(review): prcomp() is called with its defaults, so the variables are
# centred but not rescaled -- confirm this is intended given that counts and
# 0/1 indicators are mixed.
princ <- prcomp(pca_sample[, network_vars])
out_pca <- data.frame(ID = pca_sample$ID,
                      SNMetric = unname(princ$x[, "PC1"]))

# Rows that were left out of the PCA get SNMetric = NA.
final_finalV2 <- left_join(final_finalV2, out_pca, by = "ID")

# Education as a factor with "Never attended" as the reference level.
final_finalV2$Education <- relevel(as.factor(final_finalV2$Education),
                                   ref = "Never attended")

# Read the confidential district workbooks. The password-protected ones are
# opened with excel.link::xl.read.file, the others with rio::import_list.
# NOTE(review): require() only warns when the package is missing; library()
# would stop the script earlier with a clearer error.
# NOTE(review): the passwords are hard-coded placeholders in this script --
# presumably they are filled in locally; consider reading them from the
# environment instead.
require(excel.link)
# First file (file name says district 2):
cf_data1 <- xl.read.file("Confidential/AOB_District 2.xlsx", 
                         password = "xxxxxx")
# to select file manually 
#cf_data1 = read_excel(file.choose())

# Adjust date formats: for each row, take the text of column 29 up to the
# first space, convert it to a number, and store it in the month column.
# NOTE(review): column 29 is addressed by position; confirm it is the
# first-vaccination month in this workbook.
for(i in 1:dim(cf_data1)[1]){
  cf_data1$`DATE OF FIRST COVID-19 VACCINATION (month)`[i] = as.numeric(strsplit(as.character(cf_data1[i,29]), split = " ")[[1]][1])
}

# 4th file from 31.12 (file name says district 3):
cf_data4 <- xl.read.file("Confidential/updated TWIFO ATI MORKWA District 3.xlsx", 
                         password = "xxxxxxx")
# to select file manually: 
#cf_data4 = read_excel(file.choose())

# 5th file (file name says district 6):
cf_data5 <- xl.read.file("Confidential/District 6.xlsx", 
                         password = "xxxxxxx")

# 6th file (file name says district 1): rbind = TRUE asks import_list to
# combine what it imports into a single table; columns 31 and 32 are then
# dropped by position.
library(rio)
cf_data6 <- import_list("Confidential/District_1_Gomoa_West_clustered ORIG.xlsx", setclass = "tbl", rbind = TRUE)
cf_data6 = cf_data6[,-c(31,32)]

# 7th file (file name says district 4): same import as above; the first row
# and columns 31 and 32 are dropped by position.
library(rio)
cf_data7 <- import_list("Confidential/District_4_Assin_North_clustered.xlsx", 
                        setclass = "tbl", rbind = TRUE)
cf_data7 = cf_data7[-c(1),-c(31,32)]

# 8th file (file name says district 5):
cf_data8 <- xl.read.file("Confidential/District 5_ASUOGYAMAN.xlsx", 
                         password = "xxxxxxxxxx")

# Adjust dates so that the files can be merged: column 31 of the district 3
# file holds the first-vaccination date as one "-"-separated value. It is
# split into numeric month / day / year columns and then removed.
# NOTE(review): the pieces are read as day-month-year (1st, 2nd, 3rd);
# confirm the workbook really stores the date in that order.
for (row in seq_len(nrow(cf_data4))) {
  date_parts <- strsplit(as.character(cf_data4[row, 31]), split = "-")[[1]]
  cf_data4$`DATE OF FIRST COVID-19 VACCINATION (month)`[row] <-
    as.numeric(date_parts[2])
  cf_data4$`DATE OF FIRST COVID-19 VACCINATION (day)`[row] <-
    as.numeric(date_parts[1])
  cf_data4$`DATE OF FIRST COVID-19 VACCINATION (year)`[row] <-
    as.numeric(date_parts[3])
}
cf_data4[, 31] <- NULL

# Merge only if necessary: stack the district files one after another.
cf_data <- rbind(cf_data1, cf_data4) # districts 2 and 3
#dim(cf_data)

cf_data <- rbind(cf_data, cf_data5) # district 6
cf_data <- rbind(cf_data, cf_data6) # district 1
dim(cf_data)
cf_data <- rbind(cf_data, cf_data7) # district 4
cf_data <- rbind(cf_data, cf_data8) # district 5

# Remove duplicates: every SubjectID that occurs more than once is dropped
# entirely (all of its rows, not just the repeats).
dups <- cf_data$SubjectID[duplicated(cf_data$SubjectID)]
# A logical mask is used instead of `-which(...)`: with no duplicates,
# `-which(...)` is an empty index and would drop every row.
cf_data <- cf_data[!(cf_data$SubjectID %in% dups), ]

### Fix CF data:
# Rows whose "day" field contains "YES"/"Yes" are pulled out, columns 27-30
# are overwritten with hand-entered values, and the rows are appended again.
day_is_yes <- cf_data$`DATE OF FIRST COVID-19 VACCINATION (day)` %in% c("YES", "Yes")
cf_issues <- cf_data[day_is_yes, ]

# One entry per affected row, in row order, for columns 27, 28, 29, 30.
# NOTE(review): presumably these are vaccinated / day / month / year --
# confirm against the workbook layout.
manual_fixes <- list(
  c("Yes", NA, "2", "2022"),
  c("Yes", "2", "2", "2022"),
  c("Yes", "27", "2", "2022"),
  c("Yes", "3", "8", "2022"),
  c("Yes", "27", "2", "2022"),
  c("Yes", "18", "2", "2022"),
  c("Yes", "6", "6", "2022"),
  c("Yes", "2", "2", "2022"),
  c("Yes", "8", "3", "2022"),
  c("Yes", "25", "2", "2022"),
  c("Yes", "12", "9", "2022"),
  c("Yes", "13", "3", "2022"),
  c("Yes", "11", "8", "2022"),
  c("Yes", "26", "6", "2022"),
  c("Yes", "5", "2", "2022"),
  c("Yes", "5", "2", "2022")
)

# The fixes are positional, so a different number (or order) of affected rows
# would silently patch the wrong records; make that visible.
if (nrow(cf_issues) != length(manual_fixes)) {
  warning("Expected ", length(manual_fixes), " rows with 'Yes' in the day ",
          "column but found ", nrow(cf_issues),
          "; the manual date fixes may be misaligned.", call. = FALSE)
}
for (k in seq_along(manual_fixes)) {
  cf_issues[k, c(27, 28, 29, 30)] <- manual_fixes[[k]]
}

# A logical mask is used instead of `-which(...)`, which would drop every row
# when nothing matches.
cf_data <- cf_data[!day_is_yes, ]
cf_data <- rbind(cf_data, cf_issues)

# Normalise month entries written as "<number> (<name>)", e.g. "7 (July)",
# to the bare number ("7"). All other entries are left untouched.
# The position of a label in this vector equals its month number.
month_labels <- c("1 (January)", "2 (February)", "3 (March)", "4 (April)",
                  "5 (May)", "6 (June)", "7 (July)", "8 (August)",
                  "9 (September)", "10 (October)", "11 (November)",
                  "12 (December)")
month_entry <- cf_data$`DATE OF FIRST COVID-19 VACCINATION (month)`
label_position <- match(month_entry, month_labels)
is_labelled <- !is.na(label_position)
# Only write back when something changes, so the column type is not altered
# when no labelled entry is present.
if (any(is_labelled)) {
  month_entry[is_labelled] <- as.character(label_position[is_labelled])
  cf_data$`DATE OF FIRST COVID-19 VACCINATION (month)` <- month_entry
}

# Vaccination status "as of" a given month.
#   month      first-vaccination month (compared as "1".."12")
#   status     the "At least one COVID-19 vaccination?" answer
#   last_month last month (1-12) that still counts
# Returns `status` where the month lies in 1..last_month. Otherwise it returns
# "No" when status is "Yes" or "No", and NA for any other status.
# NOTE(review): only the month is inspected, not the year -- confirm that all
# recorded vaccinations fall in the same calendar year.
vaccinated_by_month <- function(month, status, last_month) {
  status <- as.character(status)
  in_window <- month %in% as.character(seq_len(last_month))
  outside_window <- ifelse(status %in% c("Yes", "No"), "No", NA)
  ifelse(in_window, status, outside_window)
}

vaccination_month <- cf_data$`DATE OF FIRST COVID-19 VACCINATION (month)`
vaccination_status <-
  cf_data$`At least one COVID-19 vaccination? (Yes/No/No Information)`

cf_data$ActVacJuly <-
  vaccinated_by_month(vaccination_month, vaccination_status, last_month = 7)
cf_data$ActVacDec <-
  vaccinated_by_month(vaccination_month, vaccination_status, last_month = 12)
cf_data$ActVacApril <-
  vaccinated_by_month(vaccination_month, vaccination_status, last_month = 4)

dim(cf_data)
table(cf_data$`District number`)
# Final combined file: attach the confidential records to the survey data by
# SubjectID (survey rows without a match keep NA in the added columns).
combined_file <- left_join(final_finalV2, cf_data, by = "SubjectID")
final_finalV2 <- combined_file

# Recode the three ActVac flags to 1 (any spelling of yes), 0 (any spelling
# of no) or NA (everything else).
yes_no_spellings <- c("Yes", "YES", "yes", "No", "NO", "no")
yes_no_codes <- c(1, 1, 1, 0, 0, 0)
for (flag in c("ActVacDec", "ActVacJuly", "ActVacApril")) {
  final_finalV2[[flag]] <-
    yes_no_codes[match(final_finalV2[[flag]], yes_no_spellings)]
}

### Add distances:
# clinic_distance = haversine distance (in km) from each respondent's recorded
# location to the nearest clinic; NA when the location is missing.
clinics <- read.csv(file = "Data/10022023clinics.csv")

library(geosphere)
library(DescTools)

# Clinic coordinates as a longitude / latitude matrix (the order geosphere
# expects).
clinic_lonlat <- cbind(clinics[, "Longitude"], clinics[, "Latitude"])

respondent_lon <- as.numeric(final_finalV2[["LocationLongitude"]])
respondent_lat <- as.numeric(final_finalV2[["LocationLatitude"]])
located <- which(!is.na(respondent_lon) & !is.na(respondent_lat))

final_finalV2$clinic_distance <- NA_real_
# distHaversine() is vectorised over the clinics, so one call per respondent
# replaces one distm() call per respondent-clinic pair.
final_finalV2$clinic_distance[located] <- vapply(
  located,
  function(i) {
    metres <- distHaversine(c(respondent_lon[i], respondent_lat[i]),
                            clinic_lonlat)
    min(metres, na.rm = TRUE) / 1000
  },
  numeric(1)
)

# Add population numbers: keep the 1st and 5th columns of the village table,
# name them Q123 (the join key) and Village_Population, and round the latter.
population <- read.csv(file = "Data/VillagePopulationTablesV2.csv")
population <- population[, c(1, 5)]
colnames(population) <- c("Q123", "Village_Population")
population$Village_Population <- round(population$Village_Population)

# Merge: attach the village population to every respondent.
semi <- left_join(final_finalV2, population, by = "Q123")
final_finalV2 <- semi

# NcashV2: like Ncash, but with the cash arms split by the village treatment
# ("Low Cash" / "High Cash").
ncash_label <- as.character(final_finalV2$Ncash)
village_label <- as.character(final_finalV2$VillageTreatment)
in_low_village <- village_label %in% "Low Cash"
in_high_village <- village_label %in% "High Cash"

# Step 1: split "cash_placebo"; every other value is carried over as is.
ncash_v2 <- ncash_label
placebo_in_cash_village <- ncash_label %in% "cash_placebo"
ncash_v2[placebo_in_cash_village] <- NA
ncash_v2[placebo_in_cash_village & in_low_village] <- "Lowcash_placebo"
ncash_v2[placebo_in_cash_village & in_high_village] <- "Highcash_placebo"
final_finalV2$NcashV2 <- ncash_v2
table(final_finalV2$NcashV2)

# Step 2: split "cash". A "cash" row whose village is neither Low nor High
# Cash keeps the label "cash"; those rows identify the villages removed below.
cash_individual <- ncash_v2 %in% "cash"
ncash_v2[cash_individual & in_low_village] <- "LowCash"
ncash_v2[cash_individual & in_high_village] <- "HighCash"
final_finalV2$NcashV2 <- ncash_v2
(table(final_finalV2$NcashV2))

# Removing villages with issues: drop every respondent of a village that
# still contains an unsplit "cash" row.
problematic_villages <- unique(final_finalV2$Q123[which(final_finalV2$NcashV2 == "cash")])
# A logical mask is used instead of `-which(...)`: when no village is
# problematic, `-which(...)` is an empty index and would drop every row.
semi2 <- final_finalV2[!(final_finalV2$Q123 %in% problematic_villages), ]
dim(semi2)
dim(final_finalV2)
final_finalV2 <- semi2
(table(final_finalV2$NcashV2))

# NcashV2 as a factor, plus a copy with "placebo" as the reference level for
# the regression models.
final_finalV2$NcashV2 <- as.factor(final_finalV2$NcashV2)
final_finalV2$Ncash_catV2 <- relevel(final_finalV2$NcashV2, ref = "placebo")

# District dummy variables dist1 ... dist6 (1 = respondent is in that
# district, 0 = another district, NA = district unknown).
for (district in 1:6) {
  final_finalV2[[paste0("dist", district)]] <-
    ifelse(final_finalV2$`District number` == as.character(district), 1, 0)
}

# Population categories: below 503 = "Small", 503 up to (not including)
# 1268 = "Mid", 1268 and above = "Large".
final_finalV2$Village_Population_Size <- as.character(cut(
  final_finalV2$Village_Population,
  breaks = c(-Inf, 503, 1268, Inf),
  labels = c("Small", "Mid", "Large"),
  right = FALSE,
  include.lowest = TRUE
))

# Export and save the file used for the analysis (all columns).
save(final_finalV2, file = "Data/FinalFinal20062023Full1-6CONFIDENTIAL.RData")

# Second copy with a fixed set of columns removed by position.
# NOTE(review): presumably these are the confidential / identifying columns;
# the positions depend on the exact column order produced above -- verify
# after any change to the variable construction.
dropped_columns <- c(5, 6, 55, 56, 58, 59,
                     139, 140, 142:148, 152, 158:163,
                     231, 232, 235, 237, 238, 242, 248, 254:259,
                     337:343, 368, 369, 376:392)
final_finalV2 <- final_finalV2[, -dropped_columns]
save(final_finalV2, file = "Data/FinalFinal20062023Full1-6.RData")


## Spillover file:

# Load data. The first two rows of the spillover export are discarded.
# NOTE(review): presumably these are extra header rows of the survey export --
# confirm.
data <- read_csv("Data/spillover_treated.csv")
data <- data[-(1:2), ]
village_id <- read_csv("Data/master_final_227.csv")

# Data preparation: the village code Q123 becomes numeric so it can be used
# as the join key, and the joined "T.y" column is renamed to "T".
data$Q123 <- as.numeric(data$Q123)
data <- left_join(data, village_id, by = "Q123")
colnames(data)[which(colnames(data) == "T.y")] <- "T"

# Working copy used for all spillover variables below.
final_finalSP <- data

## New variables for the SPILLOVER sample
# Attended: 1 when Q10.11 is "Never attended", 0 for any other answer, NA
# when missing.
# NOTE(review): the name suggests the opposite coding -- confirm.
final_finalSP$Attended <- as.numeric(final_finalSP$Q10.11 == "Never attended")

# Gender: keep "Female" and "Male"; everything else (for example
# "Prefer not to say") and missing values become NA.
sp_gender_answer <- as.character(final_finalSP$Q10.3)
final_finalSP$Gender <- ifelse(sp_gender_answer %in% c("Female", "Male"),
                               sp_gender_answer, NA)

# Vaccine status, self-reported: rename Vaccine_One and recode it to
# 1 = yes, 0 = no, NA = anything else.
# NOTE(review): the answer texts below read "one shot a COVID-19 vaccine"
# (no "of"); confirm they match the survey export exactly, otherwise every
# value becomes NA.
names(final_finalSP)[names(final_finalSP) == "Vaccine_One"] <- "vaccine_reported"
sp_vaccine_answers <- c(
  "Yes, I have received at least one shot a COVID-19 vaccine",
  "No, I have not received at least one shot a COVID-19 vaccine"
)
final_finalSP$vaccine_reported <-
  c(1, 0)[match(final_finalSP$vaccine_reported, sp_vaccine_answers)]

# Village-level treatment variables for the spillover sample, derived from
# column "T". Labels that are not listed (including NA) become NA.
sp_village_arm <- final_finalSP[["T"]]
sp_arm_labels <- c("Placebo", "CDC health", "Low Cash", "High Cash")
sp_arm_position <- match(sp_village_arm, sp_arm_labels)

# Vcash: "Low Cash" and "High Cash" pooled into "cash"; stored as a factor,
# with Vcash_cat using "placebo" as the reference level.
final_finalSP$Vcash <-
  as.factor(c("placebo", "CDC", "cash", "cash")[sp_arm_position])
final_finalSP$Vcash_cat <- relevel(final_finalSP$Vcash, ref = "placebo")

# VcashV2: the two cash arms kept apart ("LowCash" / "HighCash").
final_finalSP$VcashV2 <-
  as.factor(c("placebo", "CDC", "LowCash", "HighCash")[sp_arm_position])
final_finalSP$Vcash_catV2 <- relevel(final_finalSP$VcashV2, ref = "placebo")

# The raw village treatment as a factor with "Placebo" as reference level.
final_finalSP$T <- relevel(as.factor(final_finalSP$T), ref = "Placebo")

# Education: collapse the Q10.11 answers into four levels.
Never <- c("Never attended")
Low <- c("Nursery", "Kindergarten", "Primary", "JSS/JHS")
Medium <- c("Middle", "SSS/SHS", "Vocational/Technical/Commercial",
            "Post middle/secondary certificate", "Post-secondary diploma")
High <- c("Bachelor degree", "Post graduate (Cert. Diploma Masters PHD etc)")

# Parallel vectors: each listed answer and the level it belongs to.
sp_education_answers <- c(Never, Low, Medium, High)
sp_education_levels <- rep(
  c("Never attended", "Low", "Medium", "High"),
  times = c(length(Never), length(Low), length(Medium), length(High))
)
# Answers that are not listed (including NA) become NA; the result is a
# factor with "Never attended" as the reference level.
final_finalSP$Education <-
  sp_education_levels[match(final_finalSP$Q10.11, sp_education_answers)]
final_finalSP$Education <- relevel(as.factor(final_finalSP$Education),
                                   ref = "Never attended")

# Age as a number.
final_finalSP$Age <- as.numeric(final_finalSP$Q10.2)

# Employed: 1 when Q10.7 is "Employed (full time)", 0 for any other answer,
# NA when missing.
final_finalSP$Employed <- as.numeric(final_finalSP$Q10.7 == "Employed (full time)")

# Gender variable with 4 categories (Female / Male, below / above 45).
# Every branch uses the numeric Age column (derived from Q10.2). The first
# branch previously read Q2.2, which is not the age question of this survey.
# The result is NA whenever Gender or Age is missing.
final_finalSP$Gender45y <- ifelse(
  final_finalSP$Gender == "Female" & final_finalSP$Age < 45, "Female below 45",
  ifelse(
    final_finalSP$Gender == "Female" & final_finalSP$Age >= 45, "Female above 45",
    ifelse(
      final_finalSP$Gender == "Male" & final_finalSP$Age < 45, "Male below 45",
      ifelse(final_finalSP$Gender == "Male" & final_finalSP$Age >= 45,
             "Male above 45", NA)
    )
  )
)

# Income variable / average spending on food (needed for the social network
# metric). Values above 840 are set to NA.
sp_income <- as.numeric(final_finalSP$Q10.8)
sp_income[!is.na(sp_income) & sp_income > 840] <- NA
final_finalSP$Income <- sp_income

# Villages visited last year / last month, as numbers.
final_finalSP$VillagesYear <- as.numeric(final_finalSP$Village_Visit_Year)
final_finalSP$VillagesMonth <- as.numeric(final_finalSP$Village_Visit_Month)

# WhatsApp and family in other villages: "Yes" -> 1, "No" -> 0, anything
# else (including NA) -> NA.
sp_yes_no <- c("Yes", "No")
final_finalSP$WhatsApp <- c(1, 0)[match(final_finalSP$WhatsApp, sp_yes_no)]
final_finalSP$FamilyVillages <-
  c(1, 0)[match(final_finalSP$Village_Family, sp_yes_no)]

# Social network metric (SNMetric): score on the first principal component
# of the four network variables listed below.
# Row identifier used to merge the scores back in.
final_finalSP$ID <- seq_len(nrow(final_finalSP))

sp_network_vars <- c("VillagesMonth", "VillagesYear", "FamilyVillages",
                     "WhatsApp")

# Only rows with all four variables observed enter the PCA.
pca_sample <- na.omit(final_finalSP[, c("ID", sp_network_vars)])

# NOTE(review): prcomp() runs with its defaults (centred, not rescaled) and
# is fitted separately from the main sample -- confirm that the two SNMetric
# columns are meant to be comparable.
princ <- prcomp(pca_sample[, sp_network_vars])
out_pca <- data.frame(ID = pca_sample$ID,
                      SNMetric = unname(princ$x[, "PC1"]))

# Rows that were left out of the PCA get SNMetric = NA.
final_finalSP <- left_join(final_finalSP, out_pca, by = "ID")

# Left join: attach the confidential records to the spillover sample by
# SubjectID (rows without a match keep NA in the added columns).
combined_file2 <- left_join(final_finalSP, cf_data, by = "SubjectID")
final_finalSP <- combined_file2

# Recode ActVacApril to 1 / 0 / NA. "NO" and "no" are accepted as well, so
# the coding matches the one applied to the main sample.
final_finalSP$ActVacApril <- ifelse(
  final_finalSP$ActVacApril %in% c("Yes", "YES", "yes"), 1,
  ifelse(final_finalSP$ActVacApril %in% c("No", "NO", "no"), 0, NA)
)

### Add distances:
# clinic_distance = haversine distance (in km) from each respondent's recorded
# location to the nearest clinic; NA when the location is missing.
clinics <- read.csv(file = "Data/10022023clinics.csv")

library(geosphere)
library(DescTools)

# Clinic coordinates as a longitude / latitude matrix (the order geosphere
# expects).
clinic_lonlat <- cbind(clinics[, "Longitude"], clinics[, "Latitude"])

# Coordinates are coerced to numeric before the NA check, so entries that
# cannot be parsed as numbers give NA instead of an error.
sp_lon <- as.numeric(final_finalSP[["LocationLongitude"]])
sp_lat <- as.numeric(final_finalSP[["LocationLatitude"]])
sp_located <- which(!is.na(sp_lon) & !is.na(sp_lat))

final_finalSP$clinic_distance <- NA_real_
# distHaversine() is vectorised over the clinics, so one call per respondent
# replaces one distm() call per respondent-clinic pair.
final_finalSP$clinic_distance[sp_located] <- vapply(
  sp_located,
  function(i) {
    metres <- distHaversine(c(sp_lon[i], sp_lat[i]), clinic_lonlat)
    min(metres, na.rm = TRUE) / 1000
  },
  numeric(1)
)

dim(final_finalSP)

# Merge: attach the village population (table built for the main sample).
semi3 <- left_join(final_finalSP, population, by = "Q123")
final_finalSP <- semi3

# Remove problematic villages (the list identified in the main sample).
# A logical mask is used instead of `-which(...)`: when no village is
# problematic, `-which(...)` is an empty index and would drop every row.
semi4 <- final_finalSP[!(final_finalSP$Q123 %in% problematic_villages), ]
dim(semi4)
dim(final_finalSP)
final_finalSP <- semi4

# District dummy variables dist1 ... dist6 (1 = respondent is in that
# district, 0 = another district, NA = district unknown).
for (district in 1:6) {
  final_finalSP[[paste0("dist", district)]] <-
    ifelse(final_finalSP$`District number` == as.character(district), 1, 0)
}

# Village size categories: below 503 = "Small", 503 up to (not including)
# 1268 = "Mid", 1268 and above = "Large".
final_finalSP$Village_Population_Size <- as.character(cut(
  final_finalSP$Village_Population,
  breaks = c(-Inf, 503, 1268, Inf),
  labels = c("Small", "Mid", "Large"),
  right = FALSE,
  include.lowest = TRUE
))


# Export and save the spillover file (all columns).
save(final_finalSP, file = "Data/FinalFinalSP20062023Full1-6Confidential.RData")

# Second copy with a fixed set of columns removed by position.
# NOTE(review): presumably these are the confidential / identifying columns;
# the positions depend on the exact column order produced above -- verify
# after any change to the variable construction.
sp_dropped_columns <- c(4, 10:15, 24, 25, 27, 28,
                        174, 175, 182:198)
final_finalSP <- final_finalSP[, -sp_dropped_columns]

save(final_finalSP, file = "Data/FinalFinalSP20062023Full1-6.RData")
























